In [1]:
import pandas as pd
import numpy as np
In [2]:
def prepare_data_to_model(path='../data/survival.csv'):
    """Load the colon-survival dataset and split it into features, treatment and target.

    Parameters
    ----------
    path : str
        Location of the CSV file; its first column is used as the index.

    Returns
    -------
    tuple(pd.DataFrame, pd.Series, pd.Series)
        ``(X, treatment, y)`` — the feature matrix, the treatment arm encoded
        as 0 = 'Obs', 1 = 'Lev', 2 = 'Lev+FU' (any other value also maps to 2),
        and the survival time target ('time').
    """
    data = pd.read_csv(path, index_col=0).drop(['id', 'study', 'etype'], axis=1)
    # Encode the categorical 'rx' column numerically: Obs -> 0, Lev -> 1, else -> 2.
    data['treatment'] = np.where(data['rx'] == 'Obs', 0,
                                 np.where(data['rx'] == 'Lev', 1, 2))
    # Chain instead of inplace mutation so the cell is idempotent on re-run;
    # -1 is the sentinel used for missing values downstream.
    data = data.drop('rx', axis=1).fillna(-1)
    return data.drop(['treatment', 'time'], axis=1), data['treatment'], data['time']

Zamieniam zmienną 'rx' na zmienną kategoryczną, 'Obs' = 0, 'Lev' = 1, 'Lev+FU' = 2

Podział zbioru na treatment, target i feature

In [3]:
# Split the dataset into features (X), treatment assignment, and survival time (y).
X, treatment, y = prepare_data_to_model()
In [4]:
# Deduplicate feature rows so each unique patient profile is explained only once
# in the Break Down loop below; the original index is kept on purpose so the
# printed ids below match the source rows.
X_nodup = X.drop_duplicates()
In [5]:
X_nodup
Out[5]:
sex age obstruct perfor adhere nodes status differ extent surg node4
1 1 43 0 0 0 5.0 1 2.0 3 0 1
3 1 63 0 0 0 1.0 0 2.0 3 0 0
5 0 71 0 0 1 7.0 1 2.0 2 0 1
7 0 66 1 0 0 6.0 1 2.0 3 1 1
9 1 69 0 0 0 22.0 1 2.0 3 1 1
... ... ... ... ... ... ... ... ... ... ... ...
1849 1 71 0 0 1 4.0 0 2.0 3 0 0
1851 0 72 0 0 0 1.0 0 2.0 3 0 0
1853 1 76 0 0 1 1.0 1 3.0 3 0 0
1855 0 48 1 0 0 4.0 0 2.0 3 1 1
1857 0 66 1 0 0 1.0 0 2.0 3 0 0

967 rows × 11 columns

XGBRegressor

Load model

In [6]:
import pickle

# Load the pre-tuned causalml XGBTRegressor meta-learner from disk.
# NOTE(review): pickle.load can execute arbitrary code if the file is
# untrusted — acceptable here only because the pickle is produced by this
# project. The relative path assumes the notebook runs from its own folder.
with open('pickles/opt_XGBTRegressor.pickle', 'rb') as file:
    xg = pickle.load(file)
The sklearn.utils.testing module is  deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.utils. Anything that cannot be imported from sklearn.utils is now part of the private API.
In [7]:
# Estimate the Average Treatment Effect with confidence bounds; te/lb/ub each
# hold one entry per treatment arm (Lev, Lev+FU) versus the control group (Obs).
te, lb, ub = xg.estimate_ate(X, treatment, y)
# pred = xg.predict(X)
In [8]:
# Report the point estimate and (lower, upper) confidence bounds per arm.
print('Average Treatment 1 Effect (XGBoost): {:.2f} ({:.2f}, {:.2f})'.format(te[0], lb[0], ub[0]))
print('Average Treatment 2 Effect (XGBoost): {:.2f} ({:.2f}, {:.2f})'.format(te[1], lb[1], ub[1]))
Average Treatment 1 Effect (XGBoost): 70.49 (25.09, 115.88)
Average Treatment 2 Effect (XGBoost): 70.89 (27.85, 113.93)

Break Down

In [9]:
from dalex import Explainer

Treatment 1

In [10]:
def func_pred(model, X):
    """Custom predict function for the dalex Explainer.

    Returns column 0 of the meta-learner's per-arm prediction matrix,
    i.e. the predicted effect for treatment arm 1 only.
    """
    # PEP 8 (E731): use `def` instead of binding a lambda to a name.
    return model.predict(X)[:, 0]
In [11]:
# Wrap the fitted meta-learner in a dalex Explainer; predictions are the
# treatment-1 effect (column 0 via func_pred), so the reported "residuals"
# against survival time y are not meaningful regression residuals here.
exp = Explainer(model = xg, data = X, y =y, model_type= 'regression', predict_function = func_pred)
Preparation of a new explainer is initiated

  -> data              : 1858 rows 11 cols
  -> target variable   : Argument 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 1858 values
  -> model_class       : causalml.inference.meta.tlearner.XGBTRegressor (default)
  -> label             : not specified, model's class short name is taken instead (default)
  -> predict function  : <function <lambda> at 0x7f63b3e4d0e0> will be used
  -> predicted values  : min = -3061.38671875, mean = 70.48515523083115, max = 2418.54833984375
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -1997.54833984375, mean = 1467.060592885423, max = 4854.280029296875
  -> model_info        : package causalml

A new explainer has been created!
In [ ]:
# Produce a Break Down explanation plot for every unique feature row.
# NOTE(review): with ~967 unique rows this renders hundreds of figures in a
# single cell — consider limiting the rows or saving the plots to disk.
for i, row in X_nodup.iterrows():
    # Print the original dataframe index so each plot can be traced to its row.
    print(i)
    break_down = exp.predict_parts(row, type='break_down')
    break_down.plot(max_vars=13)
1
3
5
7
9
11
13
15
17
19
21
23
25
27
29
31
32
33
35
37
39
41
42
43
45
47
49
51
53
55
57
59
61
63
65
67
69
71
73
75
77
79
81
83
85
87
89
91
93
95
97
99
101
103
105
107
109
111
113
114
115
117
119
121
123
125
127
129
131
133
135
137
139
141
143
145
147
149
151
153
155
157
158
159
161
163
165
166
167
169
171
173
175
177
179
181
183
185
187
189
191
192
193
195
196
197
199
201
203
205
207
209
210
211
213
215
217
219
220
221
223
225
227
229
231
233
235
237
239
241
243
245
247
249
251
253
255
257
259
261
263
264
265
267
269
271
273
275
277
279
280
281
283
285
287
289
291
293
295
297
299
301
303
305
307
309
310
311
313
315
316
317
319
321
323
325
326
327